This program uses Python and unsupervised learning to cluster cryptocurrencies by their performance over different time periods, using K-Means and Principal Component Analysis (PCA). The dataset is a CSV file containing the returns (percentage price changes) of cryptocurrencies over various periods.
The analysis process is divided into the following steps:
1. Data import
2. Data preparation
3. Determining the optimal value of 'k' using the original dataset
4. Cryptocurrency clustering with K-Means using the original dataset
5. Optimization of clusters using Principal Component Analysis
6. Determining the optimal value of 'k' using the PCA dataset
7. Cryptocurrency clustering with K-Means using the PCA dataset
8. Visualization and comparison of results

Overall, this code is designed to cluster cryptocurrencies in a structured and systematic manner, using a combination of Python programming and unsupervised learning techniques, in order to gain insights into their performance in different time periods.
# dependencies and setup
from package.constants import * # constants
from package.helpers import * # libraries, dependencies and functions
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Read the crypto returns CSV into a DataFrame indexed by coin id
market_csv = DATA_URL + "crypto_market_data.csv"
df_market_data = pd.read_csv(market_csv, index_col="coin_id")
# Preview the first rows
df_market_data.head()
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| coin_id | |||||||
| bitcoin | 1.08388 | 7.60278 | 6.57509 | 7.67258 | -3.25185 | 83.51840 | 37.51761 |
| ethereum | 0.22392 | 10.38134 | 4.80849 | 0.13169 | -12.88890 | 186.77418 | 101.96023 |
| tether | -0.21173 | 0.04935 | 0.00640 | -0.04237 | 0.28037 | -0.00542 | 0.01954 |
| ripple | -0.37819 | -0.60926 | 2.24984 | 0.23455 | -17.55245 | 39.53888 | -16.60193 |
| bitcoin-cash | 2.90585 | 17.09717 | 14.75334 | 15.74903 | -13.71793 | 21.66042 | 14.49384 |
# Print the index range, column dtypes and non-null counts of the raw data
df_market_data.info()
<class 'pandas.core.frame.DataFrame'> Index: 41 entries, bitcoin to digibyte Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price_change_percentage_24h 41 non-null float64 1 price_change_percentage_7d 41 non-null float64 2 price_change_percentage_14d 41 non-null float64 3 price_change_percentage_30d 41 non-null float64 4 price_change_percentage_60d 41 non-null float64 5 price_change_percentage_200d 41 non-null float64 6 price_change_percentage_1y 41 non-null float64 dtypes: float64(7) memory usage: 2.6+ KB
# Summary statistics for every column of the raw data, rounded to two decimals
df_market_data.describe(include = 'all').round(2)
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| count | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 |
| mean | -0.27 | 4.50 | 0.19 | 1.55 | -0.09 | 236.54 | 347.67 |
| std | 2.69 | 6.38 | 8.38 | 26.34 | 47.37 | 435.23 | 1247.84 |
| min | -13.53 | -6.09 | -18.16 | -34.71 | -44.82 | -0.39 | -17.57 |
| 25% | -0.61 | 0.05 | -5.03 | -10.44 | -25.91 | 21.66 | 0.41 |
| 50% | -0.06 | 3.30 | 0.11 | -0.04 | -7.54 | 83.91 | 69.69 |
| 75% | 0.61 | 7.60 | 5.51 | 4.58 | 0.66 | 216.18 | 168.37 |
| max | 4.84 | 20.69 | 24.24 | 140.80 | 223.06 | 2227.93 | 7852.09 |
# `line` is not defined in this file (presumably supplied by package.helpers — TODO confirm);
# it is called here with the raw returns and a plot title.
line(df_market_data,"Price Change Over Time")
# Standardize every return column with scikit-learn's StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_market_data)
# Wrap the scaled array in a DataFrame that reuses the original row and column labels
df_market_scaled = pd.DataFrame(
    data=data_scaled,
    index=df_market_data.index,
    columns=df_market_data.columns,
)
df_market_scaled.head()
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| coin_id | |||||||
| bitcoin | 0.508529 | 0.493193 | 0.772200 | 0.235460 | -0.067495 | -0.355953 | -0.251637 |
| ethereum | 0.185446 | 0.934445 | 0.558692 | -0.054341 | -0.273483 | -0.115759 | -0.199352 |
| tether | 0.021774 | -0.706337 | -0.021680 | -0.061030 | 0.008005 | -0.550247 | -0.282061 |
| ripple | -0.040764 | -0.810928 | 0.249458 | -0.050388 | -0.373164 | -0.458259 | -0.295546 |
| bitcoin-cash | 1.193036 | 2.000959 | 1.760610 | 0.545842 | -0.291203 | -0.499848 | -0.270317 |
# Summary statistics of the standardized data, rounded to two decimals
df_market_scaled.describe(include = 'all').round(2)
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| count | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 |
| mean | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 |
| std | 1.01 | 1.01 | 1.01 | 1.01 | 1.01 | 1.01 | 1.01 |
| min | -4.98 | -1.68 | -2.22 | -1.39 | -0.96 | -0.55 | -0.30 |
| 25% | -0.13 | -0.71 | -0.63 | -0.46 | -0.55 | -0.50 | -0.28 |
| 50% | 0.08 | -0.19 | -0.01 | -0.06 | -0.16 | -0.36 | -0.23 |
| 75% | 0.33 | 0.49 | 0.64 | 0.12 | 0.02 | -0.05 | -0.15 |
| max | 1.92 | 2.57 | 2.91 | 5.35 | 4.77 | 4.63 | 6.09 |
# `line` is not defined in this file (presumably supplied by package.helpers — TODO confirm);
# it is called here with the standardized returns and a plot title.
line(df_market_scaled,"Standardized Price Change Over Time")
from sklearn.cluster import KMeans
# Fit a 4-cluster K-Means model on the standardized returns
kmeans_settings = {"n_clusters": 4, "n_init": 25, "random_state": 1234}
km = KMeans(**kmeans_settings)
km.fit(df_market_scaled)
KMeans(n_clusters=4, n_init=25, random_state=1234)
# Cluster label assigned to each row of df_market_scaled, in row order
km.labels_
array([0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 1, 3, 1, 1, 1, 1])
# Within-cluster sum of squared distances of the fitted model
km.inertia_
79.02243535120977
# Number of rows in each cluster, ordered by cluster label
pd.Series(km.labels_).value_counts().sort_index()
0 13 1 26 2 1 3 1 dtype: int64
# Centroid coordinates of each cluster in the standardized feature space
km.cluster_centers_
array([[ 0.23756041, 1.19101578, 0.83462785, 0.19065425, -0.212313 ,
-0.22264199, -0.20800783],
[ 0.03258562, -0.56998841, -0.4827023 , -0.25453208, -0.07913767,
-0.18795807, -0.18203893],
[-4.98104189, -0.04517829, -1.20695612, -1.21212587, 0.04773554,
4.63238025, 6.08862545],
[ 1.04553034, -0.61832816, 2.90705393, 5.35145461, 4.76991278,
3.14887546, 1.34848839]])
# Label the fitted centroids with the feature names so they read as a table
cluster_centers = pd.DataFrame(
    data=km.cluster_centers_,
    columns=df_market_data.columns,
)
cluster_centers
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| 0 | 0.237560 | 1.191016 | 0.834628 | 0.190654 | -0.212313 | -0.222642 | -0.208008 |
| 1 | 0.032586 | -0.569988 | -0.482702 | -0.254532 | -0.079138 | -0.187958 | -0.182039 |
| 2 | -4.981042 | -0.045178 | -1.206956 | -1.212126 | 0.047736 | 4.632380 | 6.088625 |
| 3 | 1.045530 | -0.618328 | 2.907054 | 5.351455 | 4.769913 | 3.148875 | 1.348488 |
import plotly.graph_objs as go  # Plotly graph objects

# Settings shared by both traces and both axes
cluster_colors = ["blue", "yellow", "red", "green"]
marker_outline = {"width": 1, "color": "black"}
axis_style = {
    "showline": True,
    "linewidth": 1.5,
    "linecolor": "black",
    "mirror": True,
    "color": "black",
    "gridcolor": "white",
}

# One marker per coin, coloured by its K-Means label
trace_points = go.Scatter(
    x=df_market_scaled["price_change_percentage_24h"],
    y=df_market_scaled["price_change_percentage_7d"],
    mode="markers",
    name="Coins",
    marker={
        "size": 12.5,
        "color": km.labels_,
        "colorscale": cluster_colors,
        "opacity": 0.9,
        "line": marker_outline,
    },
    text=df_market_scaled.index,  # hover text is the row index value
)

# One large translucent diamond per row of cluster_centers
centroid_hover = [f"Centroid {position}" for position, _ in enumerate(cluster_centers.index)]
trace_centroids = go.Scatter(
    x=cluster_centers["price_change_percentage_24h"],
    y=cluster_centers["price_change_percentage_7d"],
    mode="markers",
    name="Cluster Centers",
    marker={
        "size": 25,
        "color": cluster_centers.index,
        "colorscale": cluster_colors,
        "symbol": "diamond",
        "opacity": 0.5,
        "line": marker_outline,
    },
    text=centroid_hover,  # hover text is "Centroid {i}"
)

# Figure layout: legend in the top-left corner, square canvas, boxed axes
layout = go.Layout(
    legend={
        "yanchor": "top",
        "y": 0.99,
        "xanchor": "left",
        "x": 0.01,
        "bgcolor": "#f7f7f7",
        "font": {"color": "black", "size": 14},
    },
    width=800,
    height=800,
    title={
        "text": "GO Plot",
        "font": {"size": 24, "color": "black"},
        "x": 0.5,
        "y": 0.91,
    },
    xaxis={"title": "Price Change Percentage 24h", **axis_style},
    yaxis={"title": "Price Change Percentage 7d", **axis_style},
    hovermode="closest",
    plot_bgcolor="#ffffff",
    paper_bgcolor="#f7f7f7",
)

# Assemble both traces into a figure and render it
fig = go.Figure(data=[trace_points, trace_centroids], layout=layout)
fig.show()
from sklearn.cluster import AgglomerativeClustering

# Ward linkage only supports Euclidean distances, which is already the default,
# so the distance argument is omitted. The original `affinity='euclidean'`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, where passing
# it raises a TypeError.
cluster = AgglomerativeClustering(n_clusters=5, linkage='ward')
labels = cluster.fit_predict(df_market_scaled)

# Work on an explicit copy so the label column is never added to df_market_scaled
df = df_market_scaled.copy()
df['cluster'] = labels

# Mean of every standardized feature per cluster.
# NOTE: this rebinds `cluster_centers`; any earlier value of that name is replaced.
cluster_centers = df.groupby('cluster').mean()
cluster_centers
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| cluster | |||||||
| 0 | 0.001893 | -0.815288 | -1.225777 | -0.370342 | 0.000158 | 0.226629 | -0.065063 |
| 1 | 0.074391 | -0.096739 | 0.147830 | -0.100759 | -0.201319 | -0.414732 | -0.249308 |
| 2 | -4.981042 | -0.045178 | -1.206956 | -1.212126 | 0.047736 | 4.632380 | 6.088625 |
| 3 | 1.045530 | -0.618328 | 2.907054 | 5.351455 | 4.769913 | 3.148875 | 1.348488 |
| 4 | 0.367596 | 1.840230 | 1.192931 | 0.313590 | -0.031483 | -0.084783 | -0.175400 |
plt.figure(figsize=(10, 8))

# Rows of df_market_scaled, coloured by the labels stored on `cluster`
ax = sns.scatterplot(
    data=df_market_scaled,
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    hue=cluster.labels_,
    palette="bright",
    alpha=0.8,
    s=150,
    legend=False,
)

# Plot the centroids as large translucent diamonds
ax = sns.scatterplot(
    data=cluster_centers,
    x='price_change_percentage_24h',
    y='price_change_percentage_7d',
    hue=cluster_centers.index,
    palette='colorblind',
    alpha=0.4,
    s=600,
    marker='D',
    ec='black',
    legend=False,
)

# Add centroid labels. Iterating over the rows pairs each label with its own
# coordinates, instead of relying on the index being exactly 0..n-1.
for cluster_id, center in cluster_centers.iterrows():
    plt.text(
        x=center['price_change_percentage_24h'],
        y=center['price_change_percentage_7d'],
        s=cluster_id,
        horizontalalignment='center',
        verticalalignment='center',
        size=15,
        alpha=0.4,
        weight='bold',
        color='k',
    )
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score


def _plot_k_metric(metric_by_k, ylabel):
    """Draw a line-plus-markers chart of one clustering metric against k.

    metric_by_k: pandas Series of metric values indexed by the number of clusters.
    ylabel: text for the y axis.
    """
    plt.figure(figsize=(8, 6))
    sns.lineplot(y=metric_by_k, x=metric_by_k.index)
    axes = sns.scatterplot(y=metric_by_k, x=metric_by_k.index, s=150)
    axes.set(xlabel='Number of Clusters (k)', ylabel=ylabel)


# Candidate numbers of clusters
k_values = range(2, 11)

wcss = []
silhouette = []
calinski = []
# Fit each model once and collect all three metrics from it. The original fitted
# the same model (same k, n_init and random_state) three separate times.
for k in k_values:
    km = KMeans(n_clusters=k, n_init=25, random_state=1234)
    km.fit(df_market_scaled)
    wcss.append(km.inertia_)
    silhouette.append(silhouette_score(df_market_scaled, km.labels_))
    calinski.append(calinski_harabasz_score(df_market_scaled, km.labels_))

wcss_series = pd.Series(wcss, index=k_values)
silhouette_series = pd.Series(silhouette, index=k_values)
calinski_series = pd.Series(calinski, index=k_values)

# One figure per metric, in the same order as before
_plot_k_metric(wcss_series, 'Within Cluster Sum of Squares (WCSS)')
_plot_k_metric(silhouette_series, 'Average Silhouette Score')
_plot_k_metric(calinski_series, 'Calinski Harabasz Score')
It looks like 4 is the best value of k!
# Create a PCA model instance and set n_components = 3
pca = PCA(n_components=3)
# Use the PCA model with `fit_transform` to reduce to three principal components.
# Every row is kept in the variable; the original applied `.head(5)` before
# assigning, so `df_market_pca` only ever held five rows.
df_market_pca = pd.DataFrame(pca.fit_transform(df_market_scaled))
# View the first five rows of the DataFrame.
df_market_pca.head(5)
| 0 | 1 | 2 | |
|---|---|---|---|
| 0 | -0.600667 | 0.842760 | 0.461595 |
| 1 | -0.458261 | 0.458466 | 0.952877 |
| 2 | -0.433070 | -0.168126 | -0.641752 |
| 3 | -0.471835 | -0.222660 | -0.479053 |
| 4 | -1.157800 | 2.041209 | 1.859715 |
# Retrieve the explained variance ratio: the fraction of the total variance
# attributed to each principal component (one entry per component).
pca.explained_variance_ratio_
array([0.3719856 , 0.34700813, 0.17603793])
# Total share of the variance retained by all fitted components together
sum(pca.explained_variance_ratio_)
0.8950316570309841
Together, the three principal components contain about 89.5% of the information. The first principal component contains about 37.2% of the variance, the second about 34.7%, and the third about 17.6%.
# Names for the principal-component columns: PCA1, PCA2, PCA3
pca_columns = [f"PCA{component}" for component in range(1, 4)]
# Project the scaled data onto one component per column name
pca = PCA(n_components=len(pca_columns))
market_pca = pca.fit_transform(df_market_scaled)
# Keep the row index of the market data on the projected data
market_pca_df = pd.DataFrame(
    data=market_pca,
    index=df_market_data.index,
    columns=pca_columns,
)
# Display the first few rows of the DataFrame
market_pca_df.head()
| PCA1 | PCA2 | PCA3 | |
|---|---|---|---|
| coin_id | |||
| bitcoin | -0.600667 | 0.842760 | 0.461595 |
| ethereum | -0.458261 | 0.458466 | 0.952877 |
| tether | -0.433070 | -0.168126 | -0.641752 |
| ripple | -0.471835 | -0.222660 | -0.479053 |
| bitcoin-cash | -1.157800 | 2.041209 | 1.859715 |
# Candidate numbers of clusters for the PCA data
k_values = range(2, 11)

inertia = []
# Append the value of the computed inertia from the `inertia_` attribute of the
# KMeans model instance. `n_init` is pinned to 10 (the default before
# scikit-learn 1.4) so the curve does not depend on the installed version's default.
for k in k_values:
    k_model = KMeans(n_clusters=k, n_init=10, random_state=1)
    k_model.fit(market_pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k_values, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
# Review the DataFrame
df_elbow.head()

# Elbow curve for the PCA data, with a dashed marker at k = 4
plt.figure(figsize=(8, 6))
ax = sns.lineplot(y=df_elbow["inertia"], x=df_elbow["k"])
ax = sns.scatterplot(y=df_elbow["inertia"], x=df_elbow["k"], s=150)
ax = ax.set(xlabel='Number of Clusters (k)',
            ylabel='Within Cluster Sum of Squares (WCSS)')
plt.axvline(x=4, color='k', linestyle='--')
<matplotlib.lines.Line2D at 0x14410f8a6d0>
# 4-cluster K-Means on the PCA data. `n_init` is pinned to 10 (the default
# before scikit-learn 1.4) so results do not depend on the installed version.
model = KMeans(n_clusters=4, n_init=10, random_state=0)
# Fit the model
model.fit(market_pca_df)
# Make predictions.
# NOTE: the variable is named `k_3` although the model uses k = 4; the name is
# kept unchanged in case other cells reference it.
k_3 = model.predict(market_pca_df)
# Create a copy of the PCA DataFrame
market_pca_df_predictions_df = market_pca_df.copy()
# Add a class column with the labels
market_pca_df_predictions_df["segments"] = k_3
market_pca_df_predictions_df
| PCA1 | PCA2 | PCA3 | segments | |
|---|---|---|---|---|
| coin_id | ||||
| bitcoin | -0.600667 | 0.842760 | 0.461595 | 0 |
| ethereum | -0.458261 | 0.458466 | 0.952877 | 0 |
| tether | -0.433070 | -0.168126 | -0.641752 | 3 |
| ripple | -0.471835 | -0.222660 | -0.479053 | 3 |
| bitcoin-cash | -1.157800 | 2.041209 | 1.859715 | 0 |
| binancecoin | -0.516534 | 1.388377 | 0.804071 | 0 |
| chainlink | -0.450711 | 0.517699 | 2.846143 | 0 |
| cardano | -0.345600 | 0.729439 | 1.478013 | 0 |
| litecoin | -0.649468 | 0.432165 | 0.600303 | 0 |
| bitcoin-cash-sv | -0.759014 | -0.201200 | -0.217653 | 3 |
| crypto-com-chain | -0.248198 | -1.376252 | -1.462026 | 3 |
| usd-coin | -0.438408 | -0.175337 | -0.663388 | 3 |
| eos | -0.693425 | -0.473815 | -0.527597 | 3 |
| monero | 0.060499 | 2.909404 | 1.498571 | 0 |
| tron | -0.393352 | -0.108192 | -0.012756 | 3 |
| tezos | -0.796176 | -0.494409 | 1.082812 | 0 |
| okb | 0.064075 | -1.269825 | -1.098829 | 3 |
| stellar | -0.489015 | -0.732719 | -0.062543 | 3 |
| cosmos | -0.306272 | 0.703415 | 1.714224 | 0 |
| cdai | -0.513528 | -0.142802 | -0.656566 | 3 |
| neo | -0.362120 | -0.986914 | -0.728752 | 3 |
| wrapped-bitcoin | -0.604265 | 0.827398 | 0.439316 | 0 |
| leo-token | -0.413296 | -0.674115 | -1.076628 | 3 |
| huobi-token | -0.407483 | -0.212507 | -0.351426 | 3 |
| nem | 0.608974 | 0.563532 | -1.148742 | 3 |
| binance-usd | -0.450211 | -0.151019 | -0.647401 | 3 |
| iota | -0.764665 | -0.517886 | 0.204990 | 3 |
| vechain | -0.556315 | -1.938209 | -1.261776 | 3 |
| zcash | -0.425147 | 0.492976 | 1.058048 | 0 |
| theta-token | 2.676868 | -0.013954 | -1.965207 | 3 |
| dash | -0.613923 | -0.479337 | 0.339565 | 3 |
| ethereum-classic | -0.579924 | -0.356334 | -0.114942 | 3 |
| ethlend | 8.089018 | -3.896891 | 2.301382 | 1 |
| maker | -0.389045 | 0.165041 | 0.379414 | 0 |
| havven | 0.865762 | -2.261882 | 0.275583 | 3 |
| omisego | 0.111675 | 0.428316 | -1.205398 | 3 |
| celsius-degree-token | 4.792395 | 6.767679 | -1.986985 | 2 |
| ontology | -0.632355 | -2.108117 | -0.652227 | 3 |
| ftx-token | -0.593142 | 0.021485 | 0.209911 | 3 |
| true-usd | -0.458131 | -0.135734 | -0.635284 | 3 |
| digibyte | -0.297910 | -0.191126 | -0.909602 | 3 |
# Scatter of the second against the third principal component, one series per segment
market_pca_df_predictions_df.hvplot.scatter(x="PCA2", y="PCA3", by="segments")
The best k value is 4!
The best value of k is the same for both the original and the PCA data! However, the elbow curve for the PCA data is shifted down (smaller y-intercept), meaning that the inertia is lower at each value of k for the PCA data.
# Initialize the kmeans model. `random_state` and `n_init` are pinned so the
# cluster numbering is reproducible between runs and scikit-learn versions;
# the original left both unset.
model_pca = KMeans(n_clusters=4, n_init=10, random_state=0)
# Fit the model with the market pca data
model_pca.fit(market_pca_df)
# Store the predictions
predictions_pca = model_pca.predict(market_pca_df)
predictions_pca
array([0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 3, 0,
3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 1, 0, 3, 3, 2, 3, 3, 3, 3])
# Make a copy of the market df (NOT pca) and add a column with the predictions
# made on the pca data, so the clusters can be compared in the original features.
# The original copied `df_market_predictions`, which is never defined and raised
# a NameError; the scaled market data is used instead.
# NOTE(review): assumes the scaled frame is the intended base — confirm.
df_market_pca_predictions = df_market_scaled.copy()
df_market_pca_predictions["predictions_pca"] = predictions_pca
df_market_pca_predictions.head(5)
# Make a copy of the market pca df and attach the same predictions.
# market_pca_df already carries the market data's row index, so the original
# `df_pca_predictions.index = i` (a leftover loop counter) is dropped.
df_pca_predictions = market_pca_df.copy()
df_pca_predictions["predictions_pca"] = predictions_pca
df_pca_predictions.head(5)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_19372\928028864.py in <module> 2 # Make a column for the predictions using the pca data 3 # Will compare clusters ----> 4 df_market_pca_predictions = df_market_predictions.copy() 5 df_market_pca_predictions["predictions_pca"] = predictions_pca 6 df_market_pca_predictions.head(5) NameError: name 'df_market_predictions' is not defined
# Scatter of 24h against 7d price change, one series per PCA-based prediction
clusters_pca_options = {
    "x": "price_change_percentage_24h",
    "y": "price_change_percentage_7d",
    "by": "predictions_pca",
    "hover_cols": 'index',
}
clusters_pca = df_market_pca_predictions.hvplot.scatter(**clusters_pca_options)
clusters_pca
# NOTE(review): `elbow_og`, `elbow_pca` and `clusters_og` are not assigned anywhere
# in the visible part of this file, so these expressions presumably raise NameError — confirm.
# `*` combines two plot objects (presumably as an overlay — TODO confirm).
elbow_og * elbow_pca
clusters_og * clusters_pca
I don't actually see a difference! The predictions for both were the same in this case.
# NOTE(review): `elbow_og`, `elbow_pca` and `clusters_og` are not assigned anywhere
# in the visible part of this file, so these expressions presumably raise NameError — confirm.
# Composition of the elbow plots
elbow_og + elbow_pca
# Composition of the scatter plots
clusters_og+clusters_pca
# Plot the dendrogram of a Ward-linkage hierarchical clustering of the scaled data.
# NOTE(review): the original comment said that cutting at y=17 gives 5 clusters,
# but the cut line below is drawn at y=6 — confirm which height is intended.
from scipy.cluster import hierarchy as sch
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
# Pass the row labels and rotation to `dendrogram` itself so each leaf is
# labelled by SciPy, instead of overwriting the tick labels afterwards.
dend3 = sch.dendrogram(
    sch.linkage(df_market_scaled, method='ward'),
    labels=list(df_market_scaled.index),
    leaf_rotation=90,
)
plt.axhline(y=6, color='k', linestyle='--')